src/libostree/ostree-repo-static-delta-core.c \
src/libostree/ostree-repo-static-delta-processing.c \
src/libostree/ostree-repo-static-delta-compilation.c \
+ src/libostree/ostree-repo-static-delta-compilation-analysis.c \
src/libostree/ostree-repo-static-delta-private.h \
$(NULL)
if USE_LIBARCHIVE
--- /dev/null
+/* -*- mode: C; c-file-style: "gnu"; indent-tabs-mode: nil; -*-
+ *
+ * Copyright (C) 2015 Colin Walters <walters@verbum.org>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include "config.h"
+
+#include <string.h>
+#include <gio/gunixoutputstream.h>
+
+#include "ostree-core-private.h"
+#include "ostree-repo-private.h"
+#include "ostree-lzma-compressor.h"
+#include "ostree-repo-static-delta-private.h"
+#include "ostree-diff.h"
+#include "ostree-rollsum.h"
+#include "otutil.h"
+#include "ostree-varint.h"
+
+/*
+ * Release an OstreeDeltaContentSizeNames record together with the
+ * checksum string and the basename array it owns.  The signature is
+ * compatible with GDestroyNotify so it can be used as the element
+ * free function of a GPtrArray or GHashTable.
+ *
+ * @v: the OstreeDeltaContentSizeNames to free
+ */
+void
+_ostree_delta_content_sizenames_free (gpointer v)
+{
+  OstreeDeltaContentSizeNames *ce = v;
+
+  g_free (ce->checksum);
+  /* A record is registered in the lookup map before its basename
+   * array exists, so on an error path it can be destroyed with
+   * basenames still NULL; g_ptr_array_unref() does not accept NULL.
+   */
+  if (ce->basenames)
+    g_ptr_array_unref (ce->basenames);
+  g_free (ce);
+}
+
+/*
+ * Walk every entry produced by @iter, descending into subdirectories,
+ * and record each file entry in @sizenames_map (checksum ->
+ * OstreeDeltaContentSizeNames).  The first time a checksum is seen its
+ * size is loaded from the repository; every occurrence appends the
+ * entry name to the record's basename list.
+ *
+ * @include_only_objects: if non-NULL, file entries whose checksum is
+ *   not a member of this table are ignored
+ *
+ * Returns: TRUE on success, FALSE with @error set by the failing call
+ */
+static gboolean
+build_content_sizenames_recurse (OstreeRepo                     *repo,
+                                 OstreeRepoCommitTraverseIter   *iter,
+                                 GHashTable                     *sizenames_map,
+                                 GHashTable                     *include_only_objects,
+                                 GCancellable                   *cancellable,
+                                 GError                        **error)
+{
+  for (;;)
+    {
+      OstreeRepoCommitIterResult step =
+        ostree_repo_commit_traverse_iter_next (iter, cancellable, error);
+
+      if (step == OSTREE_REPO_COMMIT_ITER_RESULT_END)
+        return TRUE;
+      if (step == OSTREE_REPO_COMMIT_ITER_RESULT_ERROR)
+        return FALSE;
+
+      if (step == OSTREE_REPO_COMMIT_ITER_RESULT_FILE)
+        {
+          char *entry_name;
+          char *file_checksum;
+          OstreeDeltaContentSizeNames *entry;
+
+          ostree_repo_commit_traverse_iter_get_file (iter, &entry_name, &file_checksum);
+
+          if (include_only_objects != NULL &&
+              !g_hash_table_contains (include_only_objects, file_checksum))
+            continue;
+
+          entry = g_hash_table_lookup (sizenames_map, file_checksum);
+          if (entry == NULL)
+            {
+              gs_unref_object GFileInfo *file_info = NULL;
+
+              entry = g_new0 (OstreeDeltaContentSizeNames, 1);
+              entry->checksum = g_strdup (file_checksum);
+
+              /* Hand the record to the map right away, so that it is
+               * not leaked if loading the file fails just below.
+               */
+              g_hash_table_replace (sizenames_map, entry->checksum, entry);
+
+              if (!ostree_repo_load_file (repo, file_checksum,
+                                          NULL, &file_info, NULL,
+                                          cancellable, error))
+                return FALSE;
+
+              entry->size = g_file_info_get_size (file_info);
+            }
+
+          if (entry->basenames == NULL)
+            entry->basenames = g_ptr_array_new_with_free_func (g_free);
+          g_ptr_array_add (entry->basenames, g_strdup (entry_name));
+        }
+      else if (step == OSTREE_REPO_COMMIT_ITER_RESULT_DIR)
+        {
+          char *dir_name;
+          char *tree_checksum;
+          char *dirmeta_checksum;
+          gs_unref_variant GVariant *subtree = NULL;
+          ostree_cleanup_repo_commit_traverse_iter
+            OstreeRepoCommitTraverseIter child_iter = { 0, };
+
+          ostree_repo_commit_traverse_iter_get_dir (iter, &dir_name, &tree_checksum, &dirmeta_checksum);
+
+          if (!ostree_repo_load_variant (repo, OSTREE_OBJECT_TYPE_DIR_TREE,
+                                         tree_checksum, &subtree,
+                                         error))
+            return FALSE;
+
+          if (!ostree_repo_commit_traverse_iter_init_dirtree (&child_iter, repo, subtree,
+                                                              OSTREE_REPO_COMMIT_TRAVERSE_FLAG_NONE,
+                                                              error))
+            return FALSE;
+
+          /* Same map and filter: the subtree's files land in the same table. */
+          if (!build_content_sizenames_recurse (repo, &child_iter,
+                                                sizenames_map, include_only_objects,
+                                                cancellable, error))
+            return FALSE;
+        }
+      else
+        g_assert_not_reached ();
+    }
+}
+
+/*
+ * Comparison callback for g_ptr_array_sort(): orders
+ * OstreeDeltaContentSizeNames records by ascending size.
+ *
+ * @a, @b: pointers to the array slots, i.e. pointers to
+ *   OstreeDeltaContentSizeNames pointers
+ *
+ * Returns: a negative value, zero or a positive value when the size
+ *   of @a is respectively smaller than, equal to or larger than @b
+ */
+static int
+compare_sizenames (const void *a,
+                   const void *b)
+{
+  const OstreeDeltaContentSizeNames *sn_a = *(OstreeDeltaContentSizeNames * const *) a;
+  const OstreeDeltaContentSizeNames *sn_b = *(OstreeDeltaContentSizeNames * const *) b;
+
+  /* The sizes are unsigned 64-bit values; returning their difference
+   * would wrap and then be truncated to int, yielding the wrong sign
+   * for large differences.  Compare explicitly instead.
+   */
+  if (sn_a->size < sn_b->size)
+    return -1;
+  if (sn_a->size > sn_b->size)
+    return 1;
+  return 0;
+}
+
+/**
+ * Collect one OstreeDeltaContentSizeNames record (checksum, size,
+ * basenames) per distinct file object reachable from @commit and
+ * return them in @out_sizenames, sorted with compare_sizenames().
+ *
+ * @include_only_objects: optional filter passed through to the tree
+ *   walk; when non-NULL only checksums contained in it are recorded
+ * @out_sizenames: on success receives the array; it owns its elements
+ *
+ * Returns: TRUE on success, FALSE with @error set otherwise
+ */
+static gboolean
+build_content_sizenames_filtered (OstreeRepo              *repo,
+                                  GVariant                *commit,
+                                  GHashTable              *include_only_objects,
+                                  GPtrArray              **out_sizenames,
+                                  GCancellable            *cancellable,
+                                  GError                 **error)
+{
+  gs_unref_ptrarray GPtrArray *sorted_entries =
+    g_ptr_array_new_with_free_func (_ostree_delta_content_sizenames_free);
+  /* Keys are the records' own checksum strings, hence no key destructor. */
+  gs_unref_hashtable GHashTable *by_checksum =
+    g_hash_table_new_full (g_str_hash, g_str_equal, NULL, _ostree_delta_content_sizenames_free);
+  ostree_cleanup_repo_commit_traverse_iter
+    OstreeRepoCommitTraverseIter root_iter = { 0, };
+  GHashTableIter map_iter;
+  gpointer map_key, map_value;
+
+  if (!ostree_repo_commit_traverse_iter_init_commit (&root_iter, repo, commit,
+                                                     OSTREE_REPO_COMMIT_TRAVERSE_FLAG_NONE,
+                                                     error))
+    return FALSE;
+
+  if (!build_content_sizenames_recurse (repo, &root_iter, by_checksum, include_only_objects,
+                                        cancellable, error))
+    return FALSE;
+
+  /* Move every record out of the map and into the array; stealing
+   * avoids running the map's value destructor on them.
+   */
+  g_hash_table_iter_init (&map_iter, by_checksum);
+  while (g_hash_table_iter_next (&map_iter, &map_key, &map_value))
+    {
+      g_hash_table_iter_steal (&map_iter);
+      g_ptr_array_add (sorted_entries, map_value);
+    }
+
+  g_ptr_array_sort (sorted_entries, compare_sizenames);
+
+  gs_transfer_out_value (out_sizenames, &sorted_entries);
+  return TRUE;
+}
+
+/*
+ * Check whether two arrays of NUL-terminated strings have at least
+ * one string in common (compared with strcmp()).
+ *
+ * Returns: TRUE as soon as a common string is found, FALSE if there
+ *   is none (including when either array is empty)
+ */
+static gboolean
+string_array_nonempty_intersection (GPtrArray    *a,
+                                    GPtrArray    *b)
+{
+  guint a_idx;
+
+  for (a_idx = 0; a_idx < a->len; a_idx++)
+    {
+      const char *needle = g_ptr_array_index (a, a_idx);
+      guint b_idx = 0;
+
+      while (b_idx < b->len)
+        {
+          const char *candidate = g_ptr_array_index (b, b_idx);
+
+          if (strcmp (needle, candidate) == 0)
+            return TRUE;
+          b_idx++;
+        }
+    }
+
+  return FALSE;
+}
+
+/*
+ * Build up a map of files with matching basenames and similar size,
+ * and use it to find apparently similar objects.
+ *
+ * @from_commit, @to_commit: the commit variants whose trees are walked
+ * @new_reachable_regfile_content is a Set<checksum> of new regular
+ * file objects; only these are considered on the "to" side.
+ * @similarity_percent_threshold: a "from" object is a candidate when
+ * its size is within this percentage of the "to" object's size.
+ * @out_modified_regfile_content: on success receives the result map,
+ * which owns its key and value strings.
+ *
+ * Currently, @out_modified_regfile_content will be a Map<to checksum,from checksum>;
+ * however in the future it would be easy to have this function return
+ * multiple candidate matches.  The hard part would be changing
+ * the delta compiler to iterate over all matches, determine
+ * a cost for each one, then pick the best.
+ *
+ * Returns: TRUE on success, FALSE with @error set otherwise
+ */
+gboolean
+_ostree_delta_compute_similar_objects (OstreeRepo                 *repo,
+                                       GVariant                   *from_commit,
+                                       GVariant                   *to_commit,
+                                       GHashTable                 *new_reachable_regfile_content,
+                                       guint                       similarity_percent_threshold,
+                                       GHashTable                **out_modified_regfile_content,
+                                       GCancellable               *cancellable,
+                                       GError                    **error)
+{
+  gboolean ret = FALSE;
+  /* Both keys and values are g_strdup()ed checksum strings, so both
+   * must be released with g_free().
+   */
+  gs_unref_hashtable GHashTable *ret_modified_regfile_content =
+    g_hash_table_new_full (g_str_hash, g_str_equal, g_free, g_free);
+  gs_unref_ptrarray GPtrArray *from_sizes = NULL;
+  gs_unref_ptrarray GPtrArray *to_sizes = NULL;
+  const double threshold_fraction = similarity_percent_threshold / 100.0;
+  /* With a threshold above 100% the lower bound would become negative,
+   * and converting a negative double to guint64 is undefined; clamp
+   * the factor at zero instead.
+   */
+  const double min_factor = (threshold_fraction < 1.0) ? (1.0 - threshold_fraction) : 0.0;
+  const double max_factor = 1.0 + threshold_fraction;
+  guint i, j;
+  guint lower;
+  guint upper;
+
+  if (!build_content_sizenames_filtered (repo, from_commit, NULL,
+                                         &from_sizes,
+                                         cancellable, error))
+    goto out;
+
+  if (!build_content_sizenames_filtered (repo, to_commit, new_reachable_regfile_content,
+                                         &to_sizes,
+                                         cancellable, error))
+    goto out;
+
+  /* Iterate over all newly added objects, find objects which have
+   * similar basename and sizes.
+   *
+   * Because the arrays are sorted by size, we can maintain a `lower`
+   * bound on the original (from) objects to start searching.
+   */
+  lower = 0;
+  upper = from_sizes->len;
+  for (i = 0; i < to_sizes->len; i++)
+    {
+      OstreeDeltaContentSizeNames *to_sizenames = to_sizes->pdata[i];
+      const guint64 min_threshold = to_sizenames->size * min_factor;
+      const guint64 max_threshold = to_sizenames->size * max_factor;
+
+      /* Don't build candidates for the empty object */
+      if (to_sizenames->size == 0)
+        continue;
+
+      for (j = lower; j < upper; j++)
+        {
+          OstreeDeltaContentSizeNames *from_sizenames = from_sizes->pdata[j];
+
+          /* Don't build candidates for the empty object */
+          if (from_sizenames->size == 0)
+            continue;
+
+          if (from_sizenames->size < min_threshold)
+            {
+              /* Everything up to and including j is too small for this
+               * and, the "to" sizes being ascending, for every later
+               * target as well.
+               */
+              lower = j + 1;
+              continue;
+            }
+
+          if (from_sizenames->size > max_threshold)
+            break;
+
+          if (!string_array_nonempty_intersection (from_sizenames->basenames, to_sizenames->basenames))
+            continue;
+
+          /* Only one candidate right now */
+          g_hash_table_insert (ret_modified_regfile_content,
+                               g_strdup (to_sizenames->checksum),
+                               g_strdup (from_sizenames->checksum));
+          break;
+        }
+    }
+
+  ret = TRUE;
+  gs_transfer_out_value (out_modified_regfile_content, &ret_modified_regfile_content);
+ out:
+  return ret;
+}
+
#include "otutil.h"
#include "ostree-varint.h"
+#define CONTENT_SIZE_SIMILARITY_THRESHOLD_PERCENT (30)
+
typedef struct {
guint64 uncompressed_size;
GPtrArray *objects;
gs_unref_bytes GBytes *tmp_to = NULL;
gs_unref_object GFileInfo *from_finfo = NULL;
gs_unref_object GFileInfo *to_finfo = NULL;
- OstreeRollsumMatches *matches;
+ OstreeRollsumMatches *matches = NULL;
ContentRollsum *ret_rollsum = NULL;
*out_rollsum = NULL;
return ret;
}
-
static gboolean
generate_delta_lowlatency (OstreeRepo *repo,
const char *from,
gboolean ret = FALSE;
GHashTableIter hashiter;
gpointer key, value;
- guint i;
OstreeStaticDeltaPartBuilder *current_part = NULL;
gs_unref_object GFile *root_from = NULL;
+ gs_unref_variant GVariant *from_commit = NULL;
gs_unref_object GFile *root_to = NULL;
- gs_unref_ptrarray GPtrArray *modified = NULL;
- gs_unref_ptrarray GPtrArray *removed = NULL;
- gs_unref_ptrarray GPtrArray *added = NULL;
+ gs_unref_variant GVariant *to_commit = NULL;
gs_unref_hashtable GHashTable *to_reachable_objects = NULL;
gs_unref_hashtable GHashTable *from_reachable_objects = NULL;
+ gs_unref_hashtable GHashTable *from_regfile_content = NULL;
gs_unref_hashtable GHashTable *new_reachable_metadata = NULL;
- gs_unref_hashtable GHashTable *new_reachable_content = NULL;
- gs_unref_hashtable GHashTable *modified_content_objects = NULL;
+ gs_unref_hashtable GHashTable *new_reachable_regfile_content = NULL;
+ gs_unref_hashtable GHashTable *new_reachable_symlink_content = NULL;
+ gs_unref_hashtable GHashTable *modified_regfile_content = NULL;
gs_unref_hashtable GHashTable *rollsum_optimized_content_objects = NULL;
gs_unref_hashtable GHashTable *content_object_to_size = NULL;
if (!ostree_repo_read_commit (repo, from, &root_from, NULL,
cancellable, error))
goto out;
- }
- if (!ostree_repo_read_commit (repo, to, &root_to, NULL,
- cancellable, error))
- goto out;
-
- /* Gather a filesystem level diff; when we do heuristics to ship
- * just parts of changed files, we can make use of this data.
- */
- modified = g_ptr_array_new_with_free_func ((GDestroyNotify) ostree_diff_item_unref);
- removed = g_ptr_array_new_with_free_func ((GDestroyNotify) g_object_unref);
- added = g_ptr_array_new_with_free_func ((GDestroyNotify) g_object_unref);
- if (!ostree_diff_dirs (OSTREE_DIFF_FLAGS_NONE, root_from, root_to, modified, removed, added,
- cancellable, error))
- goto out;
- modified_content_objects = g_hash_table_new_full (g_str_hash, g_str_equal,
- g_free, g_free);
- for (i = 0; i < modified->len; i++)
- {
- OstreeDiffItem *diffitem = modified->pdata[i];
- /* Theoretically, a target file could replace multiple source
- * files. That could happen if say a project changed from having
- * multiple binaries to one binary.
- *
- * In that case, we have last one wins behavior. For ELF rollsum
- * tends to be useless unless there's a large static data blob.
- */
- g_hash_table_replace (modified_content_objects,
- g_strdup (diffitem->target_checksum),
- g_strdup (diffitem->src_checksum));
- }
+ if (!ostree_repo_load_variant (repo, OSTREE_OBJECT_TYPE_COMMIT, from,
+ &from_commit, error))
+ goto out;
- if (from)
- {
if (!ostree_repo_traverse_commit (repo, from, 0, &from_reachable_objects,
cancellable, error))
goto out;
}
+ if (!ostree_repo_read_commit (repo, to, &root_to, NULL,
+ cancellable, error))
+ goto out;
+ if (!ostree_repo_load_variant (repo, OSTREE_OBJECT_TYPE_COMMIT, to,
+ &to_commit, error))
+ goto out;
+
if (!ostree_repo_traverse_commit (repo, to, 0, &to_reachable_objects,
cancellable, error))
goto out;
new_reachable_metadata = ostree_repo_traverse_new_reachable ();
- new_reachable_content = ostree_repo_traverse_new_reachable ();
+ new_reachable_regfile_content = g_hash_table_new_full (g_str_hash, g_str_equal, NULL, g_free);
+ new_reachable_symlink_content = g_hash_table_new_full (g_str_hash, g_str_equal, NULL, g_free);
g_hash_table_iter_init (&hashiter, to_reachable_objects);
while (g_hash_table_iter_next (&hashiter, &key, &value))
if (OSTREE_OBJECT_TYPE_IS_META (objtype))
g_hash_table_add (new_reachable_metadata, serialized_key);
else
- g_hash_table_add (new_reachable_content, serialized_key);
+ {
+ gs_unref_object GFileInfo *finfo = NULL;
+ GFileType ftype;
+
+ if (!ostree_repo_load_file (repo, checksum, NULL, &finfo, NULL,
+ cancellable, error))
+ goto out;
+
+ ftype = g_file_info_get_file_type (finfo);
+ if (ftype == G_FILE_TYPE_REGULAR)
+ g_hash_table_add (new_reachable_regfile_content, g_strdup (checksum));
+ else if (ftype == G_FILE_TYPE_SYMBOLIC_LINK)
+ g_hash_table_add (new_reachable_symlink_content, g_strdup (checksum));
+ else
+ g_assert_not_reached ();
+ }
}
-
- g_printerr ("modified: %u removed: %u added: %u\n",
- modified->len, removed->len, added->len);
- g_printerr ("new reachable: metadata=%u content=%u\n",
+
+ if (from_commit)
+ {
+ if (!_ostree_delta_compute_similar_objects (repo, from_commit, to_commit,
+ new_reachable_regfile_content,
+ CONTENT_SIZE_SIMILARITY_THRESHOLD_PERCENT,
+ &modified_regfile_content,
+ cancellable, error))
+ goto out;
+ }
+ else
+ modified_regfile_content = g_hash_table_new (g_str_hash, g_str_equal);
+
+ g_printerr ("modified: %u\n", g_hash_table_size (modified_regfile_content));
+ g_printerr ("new reachable: metadata=%u content regular=%u symlink=%u\n",
g_hash_table_size (new_reachable_metadata),
- g_hash_table_size (new_reachable_content));
+ g_hash_table_size (new_reachable_regfile_content),
+ g_hash_table_size (new_reachable_symlink_content));
/* We already ship the to commit in the superblock, don't ship it twice */
g_hash_table_remove (new_reachable_metadata,
g_free,
(GDestroyNotify) content_rollsums_free);
- g_hash_table_iter_init (&hashiter, modified_content_objects);
+ g_hash_table_iter_init (&hashiter, modified_regfile_content);
while (g_hash_table_iter_next (&hashiter, &key, &value))
{
const char *to_checksum = key;
g_printerr ("rollsum for %u/%u modified\n",
g_hash_table_size (rollsum_optimized_content_objects),
- g_hash_table_size (modified_content_objects));
+ g_hash_table_size (modified_regfile_content));
current_part = allocate_part (builder);
/* Scan for large objects, so we can fall back to plain HTTP-based
* fetch.
*/
- g_hash_table_iter_init (&hashiter, new_reachable_content);
+ g_hash_table_iter_init (&hashiter, new_reachable_regfile_content);
while (g_hash_table_iter_next (&hashiter, &key, &value))
{
- GVariant *serialized_key = key;
- const char *checksum;
- OstreeObjectType objtype;
+ const char *checksum = key;
guint64 uncompressed_size;
gboolean fallback = FALSE;
- ostree_object_name_deserialize (serialized_key, &checksum, &objtype);
-
/* Skip content objects we rollsum'd */
if (g_hash_table_contains (rollsum_optimized_content_objects, checksum))
continue;
- if (!ostree_repo_load_object_stream (repo, objtype, checksum,
+ if (!ostree_repo_load_object_stream (repo, OSTREE_OBJECT_TYPE_FILE, checksum,
NULL, &uncompressed_size,
cancellable, error))
goto out;
if (fallback)
{
gs_free char *size = g_format_size (uncompressed_size);
- g_printerr ("fallback for %s (%s)\n",
- ostree_object_to_string (checksum, objtype), size);
+ g_printerr ("fallback for %s (%s)\n", checksum, size);
g_ptr_array_add (builder->fallback_objects,
- g_variant_ref (serialized_key));
+ ostree_object_name_serialize (checksum, OSTREE_OBJECT_TYPE_FILE));
g_hash_table_iter_remove (&hashiter);
}
}
- /* Now non-rollsummed content */
- g_hash_table_iter_init (&hashiter, new_reachable_content);
+ /* Now non-rollsummed regular file content */
+ g_hash_table_iter_init (&hashiter, new_reachable_regfile_content);
while (g_hash_table_iter_next (&hashiter, &key, &value))
{
- GVariant *serialized_key = key;
- const char *checksum;
- OstreeObjectType objtype;
-
- ostree_object_name_deserialize (serialized_key, &checksum, &objtype);
+ const char *checksum = key;
/* Skip content objects we rollsum'd */
if (g_hash_table_contains (rollsum_optimized_content_objects, checksum))
continue;
if (!process_one_object (repo, builder, ¤t_part,
- checksum, objtype,
+ checksum, OSTREE_OBJECT_TYPE_FILE,
+ cancellable, error))
+ goto out;
+ }
+
+ /* Now symlinks */
+ g_hash_table_iter_init (&hashiter, new_reachable_symlink_content);
+ while (g_hash_table_iter_next (&hashiter, &key, &value))
+ {
+ const char *checksum = key;
+
+ if (!process_one_object (repo, builder, ¤t_part,
+ checksum, OSTREE_OBJECT_TYPE_FILE,
cancellable, error))
goto out;
}
gboolean *out_have_all,
GCancellable *cancellable,
GError **error);
+
+typedef struct { /* One content object found while walking a commit: its checksum, size and the names it appears under */
+ char *checksum; /* owned copy of the object's checksum; released by _ostree_delta_content_sizenames_free() */
+ guint64 size; /* size reported by the object's GFileInfo (g_file_info_get_size) */
+ GPtrArray *basenames; /* owned array of g_strdup()ed entry names referring to this object; may be NULL until the first name is added */
+} OstreeDeltaContentSizeNames;
+
+void _ostree_delta_content_sizenames_free (gpointer v);
+
+gboolean
+_ostree_delta_compute_similar_objects (OstreeRepo *repo,
+ GVariant *from_commit,
+ GVariant *to_commit,
+ GHashTable *new_reachable_regfile_content,
+ guint similarity_percent_threshold,
+ GHashTable **out_modified_regfile_content,
+ GCancellable *cancellable,
+ GError **error);
+
G_END_DECLS